#==============================================================================#
# 7-xwalk-locations.R
# Matt Curtis 25/11/22
# mjdcurtis@gmail.com
# updated Feb 14 2023
# updated May 29 2025
#------------------------------------------------------------------------------#
# use bailliage shapes and geonames lat lons to get coutumier data
#==============================================================================#


#library(tidyverse)
#library(data.table)
library(arrow)
library(tidytable)
library(readxl)
library(sf)
library(progress)
library(stringr)
#------------------------------------------------------------------------------#
rm(list = ls())
gc()

# setwd once
setwd("~/Replication package")
genidir<-"~/Replication package/"

#------------------------------------------------------------------------------#

# this one is a duplicate on custom_name!
bad<-"Coutumes locales des prevostez, seigneuries et chastellenies, lieux et villages du haut pays d'Auvergne"
# customs
list_customs <- read_excel( "./1_raw_data/1_2_geni/auxiliary_files/list-customs_FR.xlsx") %>% 
  mutate(partible  = str_to_title(`partible/impartible`)) %>% 
  mutate(excluded = female_excluded) %>% 
  filter(coutume_name_full!=bad)


# bailliages
list_bailliages <- read_excel("./1_raw_data/1_2_geni/auxiliary_files/brette_bailliages.xlsx",    
                              sheet = "brette_coutumes") 

# a few bailliages had IDs like XIV whereas the shapefile just has numbers
fix<-fread('./1_raw_data/1_2_geni/auxiliary_files/fr-fix-bailliages.csv') %>% 
  mutate(correction = as.character(correction))

list_bailliages<-list_bailliages %>% 
  left_join(fix) %>% 
  mutate(BAILLIAGE = case_when(
    !is.na(correction)~ correction,
    is.na(correction)~bailliage
  ) %>% as.numeric()) %>% 
  select(BAILLIAGE,coutume_name,generalite,bailliage_name) %>% 
  # fixing introduces some duplicates
  # for the duplicates, I preferred the second choice each time
  mutate(n = n(),r = row_number(),.by=BAILLIAGE) %>% 
  filter(n==1|r==2) %>% 
  select(-n,-r)
  
# link customs to bailliages
list_customs <-list_bailliages %>% 
  left_join(list_customs)


# shapefile

france <- read_sf(dsn = "./1_raw_data/1_2_geni/auxiliary_files/z-French-bailliages", layer = "BRETTE_BAILLIAGES_COMMUNE_temp")
crs <- st_crs(france)

# merge shapefile to customs
france <- left_join(france,
                    select(list_customs,
                           BAILLIAGE,
                           subregion= bailliage_name,
                           region=generalite,
                           Custom1 = coutume_name,
                           partible,
                           excluded,
                           impartible_type,
                           partible_type,
                           primo_ultimo='primogeniture/ultimogeniture',
                    )) %>% 
  st_as_sf()

# geocoded locations
locs<-read_parquet('./2_scripts/2_0_tempfiles/fr-geocoded-v3.parquet') %>% 
  tidytable()
before<-nrow(locs)

# convert locs to spatial data
locs_sf<-locs %>% 
  filter(!is.na(longitude)) %>% 
  distinct()%>%
  #  as_tibble %>% 
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs="EPSG:4326",
    agr = "identity"
  ) %>%
  st_transform(crs = crs) 

# join the spatial data
locs_sf <- france %>%
  st_join(locs_sf, join = st_covers) %>% 
  st_set_geometry(NULL) %>% 
  select(locid,region,subregion,Custom1,partible,
         excluded,primo_ultimo,partible_type,impartible_type,
         BAILLIAGE,
         droitecrit) %>% distinct()



locs<-left_join(locs,locs_sf)
stopifnot(nrow(locs)==before)

# cities

fix<-fread("./1_raw_data/1_2_geni/auxiliary_files/fr-cities-manual-fix.csv") %>% 
  select(pop,latitude = geonames.lat,longitude=geonames.lng) 


locs<-left_join(locs,fix)
stopifnot(nrow(locs)==before)


# compute city distance

# convert to spatial
locs_sf<-locs %>% 
  filter(!is.na(longitude)) %>% 
  select(longitude,latitude,locid) %>% 
  distinct()%>%
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs="EPSG:4326",
    agr = "identity"
  )

# convert to spatial
# conditional on 1k+
city_sf<-locs %>% 
  filter(pop>0) %>% 
  filter(!is.na(longitude)) %>% 
  select(longitude,latitude) %>% 
  mutate(long=longitude,lat=latitude) %>% 
  distinct()%>%
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs="EPSG:4326",
    agr = "identity"
  )

# compute distance --- slow
print("Computing distance")
distance<-st_distance(locs_sf,city_sf)
min<-distance %>% matrixStats::rowMins()
locs_sf$distance<-min
print("Done")


# merge back to locs
merge_back<-data.table(
  locs_sf$distance,
  locs_sf$locid)

names(merge_back)<-c("city_distance","locid")

locs<-left_join(locs,merge_back)
stopifnot(nrow(locs)==before)

write_parquet(locs,'./2_scripts/2_0_tempfiles/fr-locations-customs.parquet')

